# 기본 모델

In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import os
# Build the dataset path from the user's home directory.
# os.path.expanduser("~") is used instead of os.getenv("HOME") because
# getenv returns None when HOME is unset, which would make the string
# concatenation raise a TypeError before the file is ever opened.
csv_path = os.path.join(
    os.path.expanduser("~"), "aiffel", "pokemon_eda", "data", "Pokemon.csv"
)
original_data = pd.read_csv(csv_path)

In [22]:
# Separate the feature columns from the target column.
original_data.columns

target = 'Legendary'
features = [
    'Total', 'HP', 'Attack', 'Defense',
    'Sp. Atk', 'Sp. Def', 'Speed', 'Generation',
]
X = original_data[features]
y = original_data[target]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)

# Fit a decision tree on the training split and predict on the held-out split.
model = DecisionTreeClassifier(random_state=25)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Check the accuracy.
from sklearn.metrics import classification_report, confusion_matrix

# NOTE: the confusion matrix is computed here but its return value is
# neither printed nor assigned.
confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.97      0.98      0.97       147
        True       0.73      0.62      0.67        13

    accuracy                           0.95       160
   macro avg       0.85      0.80      0.82       160
weighted avg       0.95      0.95      0.95       160



# 데이터 전처리 - recall 값 올리기

In [23]:
# Work on a copy so that original_data is left untouched.
pokemon = original_data.copy()
# The legendary subset is taken before the names are normalized below, so
# legendary["Name"] keeps the original spellings.
legendary = pokemon[pokemon["Legendary"] == True].reset_index(drop=True)

# Name length (measured on the original, un-normalized names) and a flag
# for names that are 10 characters or longer.
pokemon["name_count"] = pokemon["Name"].str.len()
pokemon["long_name"] = pokemon["name_count"] >= 10

# Normalize the names listed here so that they contain only letters and
# spaces. A single mapping applied to the Name column replaces the nine
# whole-DataFrame replace() calls, which each scanned every column.
name_fixes = {
    "Nidoran♀": "Nidoran X",
    "Nidoran♂": "Nidoran Y",
    "Farfetch'd": "Farfetchd",
    "Mr. Mime": "Mr Mime",
    "Porygon2": "Porygon Two",
    "Ho-oh": "Ho Oh",
    "Mime Jr.": "Mime Jr",
    "Porygon-Z": "Porygon Z",
    "Zygarde50% Forme": "Zygarde Forme",
}
pokemon["Name"] = pokemon["Name"].replace(name_fixes)

# Helper columns derived from the normalized names: the name with spaces
# removed, and whether that string is purely alphabetic.
pokemon["Name_nospace"] = pokemon["Name"].str.replace(" ", "", regex=False)
pokemon["name_isalpha"] = pokemon["Name_nospace"].str.isalpha()

import re

def tokenize(name):
    """Split a name into its capitalized tokens.

    A token is one uppercase ASCII letter followed by zero or more
    lowercase ASCII letters. Digits, punctuation, whitespace and lowercase
    runs that do not follow an uppercase letter are skipped.

    Args:
        name: The name string to tokenize.

    Returns:
        A numpy array holding the tokens in order of appearance (an empty
        array when the name contains no token).
    """
    # The pattern can never match across whitespace, so scanning the whole
    # name yields the same tokens as scanning each space-separated part.
    tokens = re.findall(r"[A-Z][a-z]*", name)
    return np.array(tokens)

# Tokenize every legendary name; each entry is the token array of one name.
all_tokens = legendary["Name"].apply(tokenize).tolist()

# Flatten the per-name token arrays into a single list of tokens.
token_set = [token for name_tokens in all_tokens for token in name_tokens]

# Add one boolean column per token that appears in a legendary name,
# indicating whether that token occurs in each Pokemon's name.
from collections import Counter

most_common = Counter(token_set).most_common()

for token, _count in most_common:
    # Using pokemon[token] = ... directly causes a warning later on, so the
    # token is converted to a plain string for the column name.
    pokemon[str(token)] = pokemon["Name"].str.contains(token)



# Preprocess the categorical Type 1 / Type 2 columns: add one boolean
# column per type, True when the type is either of the Pokemon's two types.
types = list(set(pokemon["Type 1"]))

for type_name in types:
    is_primary = pokemon["Type 1"] == type_name
    is_secondary = pokemon["Type 2"] == type_name
    pokemon[type_name] = is_primary | is_secondary

# Preview the original type columns next to the new indicator columns.
pokemon[["Type 1", "Type 2"] + types].head()


Unnamed: 0,Type 1,Type 2,Dragon,Rock,Steel,Flying,Ghost,Fire,Ground,Psychic,Normal,Poison,Grass,Ice,Dark,Electric,Bug,Fairy,Fighting,Water
0,Grass,Poison,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
1,Grass,Poison,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
2,Grass,Poison,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
3,Grass,Poison,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
4,Fire,,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


# 베이스라인 모델 학습

In [24]:
# Choose the model inputs from the pokemon columns, leaving out string
# data, helper columns used only during preprocessing, and data that has
# already been processed into other columns.
target = "Legendary"
features = [
    'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
    'Generation',
    'name_count', 'long_name',
    'Forme', 'Mega', 'Mewtwo', 'Deoxys', 'Kyurem', 'Latias', 'Latios',
    'Kyogre', 'Groudon', 'Hoopa',
    'Poison', 'Ground', 'Flying', 'Normal', 'Water', 'Fire', 'Electric',
    'Rock', 'Dark', 'Fairy', 'Steel', 'Ghost', 'Psychic', 'Ice', 'Bug',
    'Grass', 'Dragon', 'Fighting',
]
X = pokemon[features]
y = pokemon[target]

# Same split parameters as the baseline model (20% test, seed 15).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)

# Fit a decision tree on the engineered features and predict on the test split.
model = DecisionTreeClassifier(random_state=25)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Check the accuracy.
from sklearn.metrics import classification_report, confusion_matrix

# NOTE: the confusion matrix is computed here but its return value is
# neither printed nor assigned.
confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.99      0.97      0.98       147
        True       0.73      0.85      0.79        13

    accuracy                           0.96       160
   macro avg       0.86      0.91      0.88       160
weighted avg       0.97      0.96      0.96       160



# 비교모델(decision tree 외)

In [25]:
# Train a random forest on the current train split; fit() returns the
# fitted estimator, so construction and fitting are chained.
random_forest = RandomForestClassifier(random_state=25).fit(X_train, y_train)

# Predict on the current test split and print the per-class metrics.
y_pred = random_forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.99      0.98      0.98       147
        True       0.79      0.85      0.81        13

    accuracy                           0.97       160
   macro avg       0.89      0.91      0.90       160
weighted avg       0.97      0.97      0.97       160



# 회고
* 느낀점: 세세한 내용에 대해 추가적으로 공부하면 좋을 것 같다.